import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
plt.style.use('ggplot')
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
OR_df = pd.read_excel("online_retail_II.xlsx", )
OR_df.head()
OR_df.info()
We check the completion, find that some CustomerID record missing value. Becasue this analyst is aiming at customer, so we remove this part.
df = OR_df[OR_df['Customer ID'].notna()].reset_index(drop=True)
df.rename(columns={'Customer ID': 'CustomerID'}, inplace=True)
df.info()
# delete test records
df = df[~df['Description'].str.contains('test')]
# dont consider returns
df = df[df['Quantity']>0]
Invoice Date is string which is not convient for analysis later. Transform data type to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['Invoice'] = df['Invoice'].astype('int')
Set colomn: Month, Day, TotalPrice(price*amount)
df['Month'] = df.InvoiceDate.values.astype('datetime64[M]')
# df['Day'] = df.InvoiceDate.dt.day
df['TotalPrice'] = df['Quantity']*df['Price']
df['Month'] = df['Month'].astype('str').apply(lambda x: x[:-3])
df.head()
monthly_amount = df.groupby('Month')['TotalPrice'].sum()
monthly_amount.head(5)
plt.figure(figsize=(10, 5))
plt.plot(monthly_amount)
plt.title('Monthly Total Amount')
The amount rise, reach the peak in November, then fell back
monthly_inv = df.groupby('Month')['Invoice'].nunique()
monthly_inv.head()
plt.figure(figsize=(10,5))
plt.plot(monthly_inv)
plt.title('Monthly Invoice Number')
monthly_sales = df.groupby('Month')['Quantity'].sum()
monthly_sales.head()
plt.figure(figsize=(10,5))
plt.plot(monthly_sales)
plt.title('Monthly Sales of Product')
Purchase time has a positive relationship with amout, which is normal. Why this trend happens? There might be promotion. Another significant reason is Black Friday.
monthly_consumers = df.groupby('Month')['CustomerID'].nunique()
monthly_consumers.head()
plt.figure(figsize=(10, 5))
plt.plot(monthly_consumers)
plt.title('Monthly Consumers')
For every month, number of consumers is less than purchase times, which means the Rate of Buyer Retention is not bad. First three seasons, number of consumers is between 1100-1700. The last season the number rised, peaking at 2630.
# show: each month/how many times each cumtomer purchase 每月/每人/消费次数
# df_monthly_c_t = df.groupby(['Month', 'CustomerID'])['Invoice'].count()
avg_purch_monthly = df.groupby(['Month'])['Quantity'].sum()/df.groupby(['Month'])['CustomerID'].nunique()
avg_purch_monthly.head()
plt.figure(figsize=(10, 5))
plt.plot(avg_purch_monthly)
plt.title('Average Number of Items Purchase of Each Month')
df_ind = df.groupby('CustomerID').agg({'Invoice':'nunique', 'Quantity':'sum','TotalPrice':'sum'})
df_ind.rename(columns={'Invoice':'Visit', 'Quantity':'Sales', 'TotalPrice':'TotalAmount'}, inplace=True)
df_ind.describe()
plt.figure(figsize=(15,10))
plt.scatter(x=df_ind['Visit'], y=df_ind['TotalAmount'], alpha=0.3)
plt.title('Average')
plt.xlabel('Visit')
plt.ylabel('Amount')
According to scatter plot, majority of points has centrolized distribution. Other points might be wholesalers.
According to Chebyshev's Inequality, in this case, numbers of amount that over £8744 less than 12%, the upper bound is 8744*3+2014 = 28246. (Setting lim as 15000 is more convient.)
plt.figure(figsize=(10,5))
plt.hist(df_ind.query('TotalAmount<15000').TotalAmount, bins=40)
plt.xlabel('Count')
plt.ylabel('Amount')
plt.title('Consumption Amount')
This histogram has a long tail, most customer spend less than £2000.
同一张发票, 相同时间购买, 视作消费一次
# 先根据Invoice去重,认为一张发票代表一次Visit, Cycle时间间隔: Visit Date - Last Visit Date, 如没有上一次访问, 那么返回NaT.
cycle = df.drop_duplicates(subset=['CustomerID', 'Invoice']).groupby('CustomerID').apply(lambda x:x.InvoiceDate-x.InvoiceDate.shift())
df_cycle = cycle.reset_index().rename(columns={'level_1':'Invoice'}).set_index('Invoice')
# 将时间差转换为天数
df_cycle['Period'] = df_cycle['InvoiceDate']/np.timedelta64(1, 'D')
# 半天内天多次Visit, 视作一次Visit
df_cycle = df_cycle[~(df_cycle['Period']<=0.5)]
df_cycle.describe()
plt.figure(figsize=(10, 5))
plt.hist(df_cycle['Period'], bins=30)
plt.xlabel('Time period')
plt.ylabel('Frequency')
plt.title('Consumption Cyle Distribution')
The average repurchase time period is 45 days. 75% of customer repurchase within 60 days. In order to imporve loyalty, doing something between 30-40 is good.
Lifetime: first-->last purchase
lifetime=0 means customer who only purchased one time.
df_csp_life = df.groupby('CustomerID')['InvoiceDate'].apply(lambda x: x.max()-x.min())
life = df_csp_life/np.timedelta64(1, 'D')
life.describe()
This is lifetime analysis including customer who only purchase once.
life_except_first = life[life>0]
life_except_first.describe()
This is lifetime analysis except customer who only purchase once.
plt.figure(figsize=(10, 5))
plt.hist(life_except_first, bins=80)
plt.xlabel('Lifetime')
plt.ylabel('Frequency')
plt.title('Customer Consumption Lifetime Distribution')
由图可知, 有大批客户仅消费一次, 部分用户集中在50-300天, 300天以后的, 属于高质量客户.
排除首次消费用户的影响, 平均生命周期是199.8天, 全部客户的平均生命周期是134.3天.
因此, 结合用户生命周期(4.4), 对首次消费的用户, 应当在30-40天内引导, 延长其消费生命周期.
Recency: days since last purchase Frequency: total purchased items / (or can be Visit Times) Monetary: total amount spent
rfm = df.pivot_table(values = ['InvoiceDate', 'Quantity', 'TotalPrice'],
index='CustomerID',
aggfunc={'InvoiceDate': 'max',
'Quantity': 'sum',
'TotalPrice': 'sum'})
rfm['InvoiceDate'] = (rfm.InvoiceDate.max() - rfm.InvoiceDate)/np.timedelta64(1, 'D')
rfm.rename(columns={'InvoiceDate':'R', 'Quantity':'F', 'TotalPrice':'M'}, inplace=True)
rfm.describe()
So far we design a RFM to evaluate customer. But we found that
rfm = rfm[rfm != 0].dropna() # 去掉为0的数据,
log_rfm = np.log(rfm)
log_rfm.describe()
import seaborn as sns
sns.set_style('ticks')
# pd.plotting.scatter_matrix(log_rfm, alpha = 0.3, figsize = (10,10), diagonal = 'kde')
sns.pairplot(log_rfm, height=5, diag_kind="kde")
我们使用KMeans聚类方法, 将客户分为8个类
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=8, random_state=0).fit(log_rfm)
label = kmeans.labels_
labeled_rfm = log_rfm.copy()
labeled_rfm['Label'] = label
labeled_rfm.head()
Customers are classified to 8 classes. According to features of each label, we evaluate different group of customer.
labeled_rfm.describe()
sns.pairplot(labeled_rfm, hue='Label', height=5)
通过Scatter Matrix, 我们发现,所有的用户已经成功的分为8个群体, 我们的目标是对这8个群体进行特征描述.
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(20,15))
ax = fig.add_subplot(111, projection='3d')
color_list = ['red','cyan','green','blue','crimson','yellowgreen','purple','brown']
for i in range(8):
g = labeled_rfm[labeled_rfm['Label']==(i)]
ax.scatter(g['R'], g['F'], g['M'], c=color_list[i])
Scores range from 1-5. For F and M, higher values are better, which has a higher weight. Recency is opposite, smaller value has higher weight.
quantiles = labeled_rfm.quantile(q=[0.2, 0.4, 0.6, 0.8]).to_dict()
def r_class(x, category, df):
"""
x: data before descretization.
category: R, F, M
df: dict of quantiles
return: score of pressent R/F/M value
"""
if x<=df[category][0.2]:
return 5
elif x<=df[category][0.4]:
return 4
elif x<=df[category][0.6]:
return 3
elif x<=df[category][0.8]:
return 2
else:
return 1
def fm_class(x, category, df):
if x<=df[category][0.2]:
return 1
elif x<=df[category][0.4]:
return 2
elif x<=df[category][0.6]:
return 3
elif x<=df[category][0.8]:
return 4
else:
return 5
"""
RFM Map to Score
"""
rfm_Seg = labeled_rfm.copy()
rfm_Seg['R'] = labeled_rfm['R'].apply(r_class, args=('R', quantiles))
rfm_Seg['F'] = labeled_rfm['F'].apply(fm_class, args=('F', quantiles))
rfm_Seg['M'] = labeled_rfm['M'].apply(fm_class, args=('M', quantiles))
rfm_Seg.head()
Combined with scatter, we can analyze the relation between Label and RFM value.
graph = sns.PairGrid(rfm_Seg,
x_vars = ['R', 'F', 'M', 'Label'],
y_vars = ['R', 'F', 'M', 'Label'],
hue='Label', height=3, palette="husl", hue_kws={'marker':["o", "s", "D", "o", "s", "D", "o", "s"]})
graph = graph.map_diag(plt.hist)
graph = graph.map_offdiag(sns.scatterplot, edgecolor='w', s=300, alpha=0.3)
graph.add_legend() # add color description
通过RFM值得特点, 结合Scatter Matrix, 对用户进行详细描述
label_avg_rfm = rfm_Seg.groupby('Label').agg({'R':'mean','F':'mean','M':'mean', 'Label':'count'})
label_avg_rfm.rename(columns={'Label':'Count'}, inplace=True)
label_avg_rfm = label_avg_rfm.round(2)
label_avg_rfm.describe()
label_avg_rfm
0: Champion: Bought recently, buy often and spend the most! 7: Loyal Customers: Spend good money with us often. Responsive to promotions.
5: Potential Loyalist: Recent customers, but spent a good amount and bought more than once.
3: Promising: Recent shoppers, but haven’t spent much.
2: Can’t Lose Them: Spent big money and purchased often. But long time ago. Need to bring them back!
1: Customers Needing Attention: Around average recency, frequency and monetary values. May not have bought very recently though.
4: Lost: Lowest recency, frequency and monetary scores. 6: Hibernating: Last purchase was long back, low spenders and bought seldomly.
label_names = {
0: 'Bought recently',
1: 'Customers Needing Attention',
2: 'Can’t Lose Them',
3: 'Promising',
4: 'Lost',
5: 'Potential Loyalist',
6: 'Hibernating',
7: 'Loyal Customers',
}
plt.figure(figsize=(5, 5))
plt.pie(label_avg_rfm['Count'].to_list(), labels=label_names.values(), autopct='%1.1f%%', radius=1, shadow=True, startangle=90)
plt.title('All Labels Numbers')
According to this pie chart, we can clearly know the percentage of each class.
For each month, if customer purchased, then the value is 1, else 0.
ind_monthly_visit = df.pivot_table(index='CustomerID', columns='Month', values='Invoice', aggfunc='nunique').fillna(0)
boolean_monthly_visit = ind_monthly_visit.applymap(lambda x: 1 if x>0 else 0)
boolean_monthly_visit.head()
from IPython.display import Image
Image('statusDecisionTree.jpg')
New: no purchase history before, and spend money this month
Inactive: purchased before, and didn't buy this month.
Return: purchased before but did't consume last month, this month buy again.
Unknown: no purchase history, and did't buy this month.
According to this decision tree, we can give a customer status for every month.
def check_status(record):
status = []
for i in range(13):
if record[i] == 1:
if len(status)==0:
s='New'
else:
if status[i-1] == 'Unknown':
s = 'New'
elif status[i-1] == 'Inactive':
s='Return'
else:
s = 'Active'
elif record[i] == 0:
if len(status)==0:
s='Unknown'
else:
if status[i-1] == 'Unknown':
s='Unknown'
else:
s='Inactive'
status.append(s)
return pd.Series(status, index=record.index.astype('str').to_list())
ind_status_df = boolean_monthly_visit.apply(check_status, axis=1)
ind_status_df.head()
status_df = ind_status_df.apply(lambda x: x.value_counts()).fillna(0)
status_df.drop(['Unknown'], inplace=True)
status_df
labels = ['Active', 'new', 'Return', 'Inactive']
x = status_df.columns.tolist()
plt.figure(figsize=(15, 5))
plt.stackplot(x, status_df.loc['Active'], status_df.loc['New'], status_df.loc['Return'], status_df.loc['Inactive'], labels=labels)
plt.legend(loc = 'upper left')
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Customer Status Stockplot')
New: Lacking of previous data, so in early time, there was alot new customers. After, the number is stable.
Active: From January, the number is stable, reaching the peak on November.
Return: The number rises slowly, also reach the peak on November.
Inactive: Rapid increase, which means a lot customers didn't repurchase often.
Multiple Purchase: Customer spent more than once in one month.
multi_df = ind_monthly_visit.applymap(lambda x: 1 if x>1 else np.NaN if x==0 else 0)
rates = multi_df.sum()/multi_df.count()
rates
plt.figure(figsize=(10, 5))
plt.plot(rates.index, rates)
plt.xlabel('Month')
plt.ylabel('Rate')
plt.title('Buyer Multiple Purchase Rate')
Multiple Purchase Rate was rising in this year. The lowest is 20%, maximun is 32%.
Repurchase: For customer who have spent month, they came again this month.
Image('repurchaseDecisionTree.jpeg')
def check_repurchase(record):
status = [np.NaN]
for i in range(1, 13):
if record[i] == 1:
if record[i-1]==1:
s = 1
else:
s = 0
else:
s = np.NaN
status.append(s)
return pd.Series(status, index=record.index.astype('str').tolist())
repurchase_df = boolean_monthly_visit.apply(check_repurchase, axis=1)
repurchase_df.head()
def purchase_back(record):
status = []
for i in range(12):
if record[i] == 1:
if record[i+1] == 1:
status.append(1)
if record[i+1] == 0:
status.append(0)
else:
status.append(np.NaN)
status.append(np.NaN)
return pd.Series(status, index=record.index.astype('str').to_list())
boolean_monthly_visit.apply(purchase_back, axis=1).sum()
repurchase_rate = repurchase_df.sum()/repurchase_df.count()
plt.figure(figsize=(10, 5))
plt.plot(repurchase_rate.index, repurchase_rate)
plt.xlabel('Month')
plt.ylabel('Rate')
plt.title('Buyer Repurchase Rate')
The annual repurchase rate is approximately 30% to 50%, and this repurchase rate is also high.
Use three attribute to describe customer: Toal quantity/visit/amount.
# For individual
ind_summary_df = df.pivot_table(index='CustomerID', values=['TotalPrice'], aggfunc={'TotalPrice':'sum'})
ind_summary_df['TotalQuantity'] = df.groupby(['CustomerID'])['Quantity'].sum()
ind_summary_df['TotalVisit'] = df.groupby(['CustomerID'])['Invoice'].nunique()
ind_summary_df['Label'] = labeled_rfm['Label'].astype('str')
# Group by Label
label_summary_df = ind_summary_df.groupby('Label').agg({'TotalPrice':'sum', 'TotalQuantity':'sum','TotalVisit':'sum', 'Label':'count'})
label_summary_df.rename(columns={'Label':'Count'}, inplace=True)
percentage_df = label_summary_df/label_summary_df.sum()
percentage_df
label_avg_df = label_summary_df[['Count']]
label_avg_df['Avg_Price'] = label_summary_df['TotalPrice']/label_summary_df['Count']
label_avg_df['Avg_Quantity'] = label_summary_df['TotalQuantity']/label_summary_df['Count']
label_avg_df['Avg_Visit'] = label_summary_df['TotalVisit']/label_summary_df['Count']
label_avg_df['Avg_Purchase_Value'] = label_avg_df['Avg_Price']/label_summary_df['TotalVisit']
label_avg_df.style.bar()
label_avg_rfm.style.bar(color='#33caff')
Champion: Bought recently, buy often and spend the most!
The number of such customers accounts for 0.06%, and the average amount of each purchase is high, they should be rewarded. They are willing to try to accept new products. Combined with the previous scatter plot, we can judge that it contains a part of wholesalers.
这类用户的数量占0.06%, 而且平均每次消费额都很高, 应当奖励他们, 也可以让他们尝试接受新的产品. 结合之前的散点图, 我们可以判断其中包含了一部分批发商.
Customers Needing Attention: Around average recency, frequency and monetary values. May not have bought very recently though.
Their proportion is 21.2%, which is the largest proportion of customers, but the amount of purchases is relatively low, indicating that the store is not attractive enough for them, and can provide a Limited Offer to re-engage them.
人数占比21.2%, 是占比最多的客户, 但是消费次数金额却比较低, 说明store对他们的吸引不够强大, 可以提供Limited Offer, 重新使他们产生兴趣
Can’t Lose Them: Spent big money and purchased often. But long time ago. Need to bring them back!
The proportion of this part of customers is 14.7%, and the total number of visits and total consumption is as high as 18%. They are very important customers. Their consumption amount and frequency are very high, but they have not appeared for a long time recently. Possible reasons: our products are not competitive, other stores attract this part of users. Provide newer and better products, research competitors' policies, and win back these customers.
这占比14.7, 而总访问次数和消费总额高达18%以上, 以前消费金额和频率都非常高, 但是最近很久没有出现过, 可能出现的原因: 厂家自身产品得不到认可, 其他商店将这部分用户吸引了. 可以提供更新更优质的产品, 研究竞争对手的操作, 将这部分客户赢回来.
Promising: Recent shoppers, but haven’t spent much.
The proportion of this part of customers is 15.4%. They have been here recently, but have n’t consumed a lot. We should build brand awareness and provide free trials.
是用户占比15.4%, 最近来过, 目前还没有消费很多, 应当建立品牌知名度, 提供免费试用等
Lost: Lowest recency, frequency and monetary scores.
The proportion of this part of users is very small and only 0.03%. They have been here long time ago. The RFM index is very low. They are low-value users. We can choose to attract them through promotion activities, otherwise give up.
这部分用户比重很小仅仅0.03%, 很久以前来过, RFM的指数都非常低, 是低价值的用户, 可以选择通过推广活动进行召回, 否则放弃
Potential Loyalist: Recent customers, but spent a good amount and bought more than once.
There are 16% of these customers. The average consumption amount and frequency are noticeable, but the average amount is lower than the overall average (1700 pounds). We should provide membership for these customers or the loyalty reward program, to improve their loyalty
这一部分客户有16%, 人均消费金额和次数都很多, 但是人均消费均额低于总体平均(1700磅)是为这一部分客户提供会员, 成立loyalty奖励项目
Hibernating: Last purchase was long back, low spenders and bought seldomly. This part of customers account for 17%, which is alot. The RFM index is very low. We can provide special discounts to attract them, and recreate brand value with pressenting relevant products.
Loyal Customers: Spend good money with us often. Responsive to promotions.
The proportion of customers is small (0.09%), but the average number of visit and consumption are much higher than ordinary users. They are customers that need attention.
这部分的客户比例比较少(0.09%), 但是购买次数和人均消费远高于普通用户, 是需要重视的客户, 可以适当提高
ind_status_df['Label'] = labeled_rfm['Label'].astype('str')
ind_status_df.head()
def get_status_rate(status):
"""
根据指定Status, 获取每个类在各个月份的占比
"""
status_label_count = pd.DataFrame()
for month in ind_status_df.columns[1:-1]:
result = ind_status_df.groupby('Label')[month].value_counts()
status_label_count[month] = result
for i in range(8):
status_label_count.T.loc['2010-01'][(str(i), status)]
label_specificStatus_df = pd.DataFrame()
for month in status_label_count.T.index:
# 该月, 8个类, 分别有多少是Inactive [171.0, 912.0, 633.0, 604.0, 143.0, 529.0, 747.0, nan]
label_inactive = pd.Series([status_label_count.T.loc[month][(str(i), status)] for i in range(8) ])
label_specificStatus_df[month] = label_inactive/label_inactive.sum()
return label_specificStatus_df
label_specificStatus_df = get_status_rate('Inactive')
label_specificStatus_df
labels = range(8)
x = label_inactive_df.columns.tolist()
plt.figure(figsize=(15, 5))
plt.stackplot(x, label_inactive_df.loc[0], label_inactive_df.loc[1], label_inactive_df.loc[2], label_inactive_df.loc[3], label_inactive_df.loc[4], label_inactive_df.loc[5], label_inactive_df.loc[6], label_inactive_df.loc[7], labels=labels)
plt.legend(loc = 'upper left')
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('Customer Status Stockplot')
According to this stackplot, there are 3 group of customers have a high perccentage which are 1, 2, 6. e should pay attention on them. 根据这个图, 我们可以看出, 在inactive客户里, 1,2和6的比重非常高, 我们应当从着重提高这三部分的业绩.